# Numeric Univariate Analysis 

# Importing the Data

#install.packages(c("FactoMineR", "factoextra"))
1 Inspecting the Data

– Number of rows and columns
– Variables: types and values

## Rows: 27
## Columns: 13
## $ X100m        <dbl> 11.04, 10.76, 11.02, 11.34, 11.13, 10.83, 11.64, 11.37, 1…
## $ Long.jump    <dbl> 7.58, 7.40, 7.23, 7.09, 7.30, 7.31, 6.81, 7.56, 6.97, 7.2…
## $ Shot.put     <dbl> 14.83, 14.26, 14.25, 15.19, 13.48, 13.76, 14.57, 14.41, 1…
## $ High.jump    <dbl> 2.07, 1.86, 1.92, 2.10, 2.01, 2.13, 1.95, 1.86, 1.95, 1.9…
## $ X400m        <dbl> 49.81, 49.37, 48.93, 50.42, 48.62, 49.91, 50.14, 51.10, 4…
## $ X110m.hurdle <dbl> 14.69, 14.05, 14.99, 15.31, 14.17, 14.38, 14.93, 15.06, 1…
## $ Discus       <dbl> 43.75, 50.72, 40.87, 46.26, 45.67, 44.41, 47.60, 44.99, 4…
## $ Pole.vault   <dbl> 5.02, 4.92, 5.32, 4.72, 4.42, 4.42, 4.92, 4.82, 4.72, 4.6…
## $ Javeline     <dbl> 63.19, 60.15, 62.77, 63.44, 55.37, 56.37, 52.33, 57.19, 5…
## $ X1500m       <dbl> 291.70, 301.50, 280.10, 276.40, 268.00, 285.10, 262.10, 2…
## $ Rank         <int> 1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 1, 2, 3, 4, 5, 6, 7,…
## $ Points       <int> 8217, 8122, 8067, 8036, 8004, 7995, 7802, 7733, 7708, 765…
## $ Competition  <fct> Decastar, Decastar, Decastar, Decastar, Decastar, Decasta…

2 Random sample of the dataframe

3 Summary of all the variables of the dataframe

##      X100m         Long.jump        Shot.put       High.jump    
##  Min.   :10.44   Min.   :6.800   Min.   :12.68   Min.   :1.860  
##  1st Qu.:10.84   1st Qu.:7.210   1st Qu.:14.17   1st Qu.:1.930  
##  Median :10.97   Median :7.310   Median :14.57   Median :1.980  
##  Mean   :10.99   Mean   :7.365   Mean   :14.54   Mean   :1.998  
##  3rd Qu.:11.13   3rd Qu.:7.545   3rd Qu.:15.01   3rd Qu.:2.080  
##  Max.   :11.64   Max.   :7.960   Max.   :16.36   Max.   :2.150  
##      X400m        X110m.hurdle       Discus        Pole.vault   
##  Min.   :46.81   Min.   :13.97   Min.   :37.92   Min.   :4.400  
##  1st Qu.:48.70   1st Qu.:14.15   1st Qu.:42.27   1st Qu.:4.660  
##  Median :49.20   Median :14.34   Median :44.72   Median :4.900  
##  Mean   :49.31   Mean   :14.50   Mean   :44.85   Mean   :4.836  
##  3rd Qu.:49.86   3rd Qu.:14.87   3rd Qu.:46.93   3rd Qu.:5.000  
##  Max.   :51.16   Max.   :15.67   Max.   :51.65   Max.   :5.400  
##     Javeline         X1500m           Rank            Points       Competition
##  Min.   :50.31   Min.   :262.1   Min.   : 1.000   Min.   :7313   Decastar:13  
##  1st Qu.:55.32   1st Qu.:271.6   1st Qu.: 4.000   1st Qu.:8000   OlympicG:14  
##  Median :57.19   Median :278.1   Median : 7.000   Median :8084                
##  Mean   :58.32   Mean   :278.5   Mean   : 7.444   Mean   :8119                
##  3rd Qu.:62.05   3rd Qu.:283.6   3rd Qu.:10.500   3rd Qu.:8236                
##  Max.   :70.52   Max.   :301.5   Max.   :19.000   Max.   :8893
#' Plot the proportion of each distinct value of one column as a bar chart
#'
#' Rows where the column is `NA` are dropped before counting, so the
#' proportions are relative to the non-missing rows only.
#'
#' @param df A data frame.
#' @param x Name of the column to plot, given as a single string.
#' @return A ggplot object with one bar per distinct value of `x`; the bar
#'   height (`prop`) is the share of non-missing rows holding that value.
plotUniCat <- function(df, x) {
  # Turn the column name into a symbol so it can be unquoted with `!!`.
  x <- sym(x)
  df %>%
    filter(!is.na(!!x)) %>%
    count(!!x) %>%
    mutate(prop = prop.table(n)) %>%
    ggplot(aes(y = prop, x = !!x)) +
    # Heights are precomputed in `prop`, so draw them as-is.
    geom_col()
}

4 Checking the column names of the dataframe

##  [1] "X100m"        "Long.jump"    "Shot.put"     "High.jump"    "X400m"       
##  [6] "X110m.hurdle" "Discus"       "Pole.vault"   "Javeline"     "X1500m"      
## [11] "Rank"         "Points"       "Competition"

5 Inspecting the structure of the dataframe

## 'data.frame':    27 obs. of  13 variables:
##  $ X100m       : num  11 10.8 11 11.3 11.1 ...
##  $ Long.jump   : num  7.58 7.4 7.23 7.09 7.3 7.31 6.81 7.56 6.97 7.27 ...
##  $ Shot.put    : num  14.8 14.3 14.2 15.2 13.5 ...
##  $ High.jump   : num  2.07 1.86 1.92 2.1 2.01 2.13 1.95 1.86 1.95 1.98 ...
##  $ X400m       : num  49.8 49.4 48.9 50.4 48.6 ...
##  $ X110m.hurdle: num  14.7 14.1 15 15.3 14.2 ...
##  $ Discus      : num  43.8 50.7 40.9 46.3 45.7 ...
##  $ Pole.vault  : num  5.02 4.92 5.32 4.72 4.42 4.42 4.92 4.82 4.72 4.62 ...
##  $ Javeline    : num  63.2 60.1 62.8 63.4 55.4 ...
##  $ X1500m      : num  292 302 280 276 268 ...
##  $ Rank        : int  1 2 4 5 7 8 9 10 11 12 ...
##  $ Points      : int  8217 8122 8067 8036 8004 7995 7802 7733 7708 7651 ...
##  $ Competition : Factor w/ 2 levels "Decastar","OlympicG": 1 1 1 1 1 1 1 1 1 1 ...

6 Preparing the data for plotting the univariate distributions of the numeric variables

# Keep only the numeric columns of decathlon2 (drops the Competition factor).
data_num <- select_if(decathlon2, is.numeric)
## 'data.frame':    27 obs. of  12 variables:
##  $ X100m       : num  11 10.8 11 11.3 11.1 ...
##  $ Long.jump   : num  7.58 7.4 7.23 7.09 7.3 7.31 6.81 7.56 6.97 7.27 ...
##  $ Shot.put    : num  14.8 14.3 14.2 15.2 13.5 ...
##  $ High.jump   : num  2.07 1.86 1.92 2.1 2.01 2.13 1.95 1.86 1.95 1.98 ...
##  $ X400m       : num  49.8 49.4 48.9 50.4 48.6 ...
##  $ X110m.hurdle: num  14.7 14.1 15 15.3 14.2 ...
##  $ Discus      : num  43.8 50.7 40.9 46.3 45.7 ...
##  $ Pole.vault  : num  5.02 4.92 5.32 4.72 4.42 4.42 4.92 4.82 4.72 4.62 ...
##  $ Javeline    : num  63.2 60.1 62.8 63.4 55.4 ...
##  $ X1500m      : num  292 302 280 276 268 ...
##  $ Rank        : int  1 2 4 5 7 8 9 10 11 12 ...
##  $ Points      : int  8217 8122 8067 8036 8004 7995 7802 7733 7708 7651 ...
# Names of the numeric columns, then one plotUniCat() chart per column,
# collected in a list.
variables <- names(data_num)
out <- lapply(variables, function(col_name) {
  plotUniCat(decathlon2, col_name)
})

7 Creating histograms for the columns in the dataframe

# One histogram per numeric column, laid out on a 4 x 3 panel grid.
par(mfrow = c(4, 3))
for (i in names(data_num)) {
  # Set the title explicitly: without `main`, hist() labels every panel with
  # the deparsed expression ("Histogram of data_num[, i]") instead of the
  # column being plotted.
  hist(data_num[[i]], main = paste("Histogram of", i), xlab = i)
}

8 Creating frequency histograms for all the columns in the dataframe, with a common y-axis (0 to 20)

# Frequency histograms on a 4 x 3 panel grid; every panel uses the same
# 0-20 y-axis range.
par(mfrow = c(4, 3))
for (i in names(data_num)) {
  hist(
    data_num[[i]],
    main = i,
    xlab = i,
    ylab = "frequency",
    ylim = c(0, 20),
    freq = TRUE
  )
}

9 Creating frequency histograms for all the columns in the dataframe, with automatic y-axis limits

# Frequency histograms on a 4 x 3 panel grid; the y-axis range is left to
# hist() for each panel.
par(mfrow = c(4, 3))
for (i in names(data_num)) {
  hist(
    data_num[[i]],
    main = i,
    xlab = i,
    freq = TRUE
  )
}

10 Creating density plot for all the columns in the dataframe

# One kernel density estimate per numeric column, on a 4 x 3 panel grid.
par(mfrow = c(4, 3))
for (i in names(data_num)) {
  plot(density(data_num[[i]]), main = i, xlab = i)
}

11 Bivariate Relationships and Correlation plots

# Scatterplot matrix of all numeric variables
# (pairs.panels is presumably psych::pairs.panels -- confirm the library call).
pairs.panels(x = data_num, col = "red")

# List the methods registered for the "factor" class.
# methods(class = class(decathlon2[, "Competition"]))
methods(class = "factor")
##  [1] [             [[            [[<-          [<-           all.equal    
##  [6] as.character  as.data.frame as.Date       as.list       as.logical   
## [11] as.POSIXlt    as.vector     c             coerce        droplevels   
## [16] format        initialize    is.na<-       length<-      levels<-     
## [21] Math          Ops           plot          print         recode       
## [26] relevel       relist        rep           scale_type    show         
## [31] slotsFromS3   summary       Summary       type_sum      xtfrm        
## see '?methods' for accessing help and source code
## [1] "Decastar" "OlympicG"
## [1] 2
## Decastar OlympicG 
##       13       14

# Correlation matrix with GGally

# Check correlations (as scatterplots) and distributions, and print the
# correlation coefficients.
ggpairs(data = data_num, title = "correlogram with ggpairs()")

# Nice visualization of correlations
ggcorr(data = data_num, method = c("everything", "pearson"))

# https://www.r-graph-gallery.com/199-correlation-matrix-with-ggally.html
# Quick display of two capabilities of GGally: assessing the distribution and
# the correlation of variables.
# From the help page:
ggpairs(
  data = flea,
  mapping = ggplot2::aes(colour = species),
  columns = 2:4
)

# Same display for the decathlon data, coloured by competition.
ggpairs(
  data = decathlon2,
  mapping = ggplot2::aes(colour = Competition),
  columns = 1:12
)